Renewable Energy Data EDA¶

Import to dataframes¶

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
import folium


# Postcode data (SSI = small scale installations, SGU = small generation unit, SWH = Solar water heating )
df_SSI_SGU_Hydro = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Hydro.csv")
df_SSI_SGU_Solar = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Solar.csv")
df_SSI_SGU_Wind = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Wind.csv")
df_SSI_SWH_Air_source_heat_pump = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SWH-Air-source-heat-pump.csv")
df_SSI_SWH_Solar = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SWH-Solar.csv")

# Battery installations with SSI by state
df_battery_installations = pd.read_csv("data/PostCode Data/State data for battery installations with small-scale systems.csv")

# Australian postcode data reference dataset
df_AU_postcode_dataset = pd.read_excel("data/australian_postcodes.xlsx")

# Whole REC database (REC = renewable energy certificate)
df_REC_database = pd.read_csv("data/wholeRECdatabase_20230120_no_dups.csv",encoding='cp1252')
In [ ]:
df_list = [
df_SSI_SGU_Hydro,
df_SSI_SGU_Solar,
df_SSI_SGU_Wind,
df_SSI_SWH_Air_source_heat_pump,
df_SSI_SWH_Solar,
df_battery_installations,
df_AU_postcode_dataset,
df_REC_database
]

for idx, df in enumerate(df_list, start=1):
    print(f"DataFrame {idx} - Name: {df.name if hasattr(df, 'name') else 'Unnamed'}, Shape: {df.shape}")
DataFrame 1 - Name: Unnamed, Shape: (20, 7)
DataFrame 2 - Name: Unnamed, Shape: (2806, 41)
DataFrame 3 - Name: Unnamed, Shape: (253, 5)
DataFrame 4 - Name: Unnamed, Shape: (2610, 21)
DataFrame 5 - Name: Unnamed, Shape: (2964, 21)
DataFrame 6 - Name: Unnamed, Shape: (10, 10)
DataFrame 7 - Name: Unnamed, Shape: (18513, 33)
DataFrame 8 - Name: Unnamed, Shape: (6271416, 17)
In [ ]:
df_REC_database.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6271416 entries, 0 to 6271415
Data columns (total 17 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Status                       object 
 1   Owner                        object 
 2   Accreditation_Code           object 
 3   Fuel_Source_Active           bool   
 4   Fuel_Source_Type             object 
 5   Fuel_Source_Display_Name     object 
 6   Generation_Year              int64  
 7   Status.1                     object 
 8   Start_Serial                 float64
 9   End_Serial                   float64
 10  State                        object 
 11  Owner_Name                   object 
 12  Creation_Year                int64  
 13  Range_ID                     int64  
 14  Creation_Date                object 
 15  Public_Registered_Person_ID  int64  
 16  Created_By                   object 
dtypes: bool(1), float64(2), int64(4), object(10)
memory usage: 771.5+ MB
In [ ]:
df_compare_systems = pd.DataFrame([], columns=['Type','Locations','System Quantity', 'KW'])
In [ ]:
sgu_hydro = df_SSI_SGU_Hydro["Small Unit Installation Postcode"].drop_duplicates()

postcodes = sgu_hydro.shape[0]
quantity = sum(df_SSI_SGU_Hydro["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Hydro["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)

print(postcodes, "unique hydro postcodes")
print(quantity, "total hydro units")
print(kw , "total hydro kw")

hydro_df = pd.DataFrame([["Hydro",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, hydro_df])
df_compare_systems
20 unique hydro postcodes
20 total hydro units
50.7 total hydro kw
Out[ ]:
Type Locations System Quantity KW
0 Hydro 20 20 50.7
In [ ]:
sgu_solar = df_SSI_SGU_Solar["Small Unit Installation Postcode"].drop_duplicates()

postcodes = sgu_solar.shape[0]
quantity = sum(df_SSI_SGU_Solar["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Solar["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)

print(postcodes, "unique solar postcodes")
print(quantity, "total solar units")
print(kw , "total solar kw")

solar_df = pd.DataFrame([["Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, solar_df])
df_compare_systems
2806 unique solar postcodes
3073708 total solar units
16519611.88 total solar kw
Out[ ]:
Type Locations System Quantity KW
0 Hydro 20 20 50.70
0 Solar 2806 3073708 16519611.88
In [ ]:
sgu_wind = df_SSI_SGU_Wind["Small Unit Installation Postcode"].drop_duplicates()

postcodes = sgu_wind.shape[0]
quantity = sum(df_SSI_SGU_Wind["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Wind["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)

print(postcodes, "unique wind postcodes")
print(quantity, "total wind units")
print(kw , "total wind kw")

wind_df = pd.DataFrame([["Wind",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, wind_df])
df_compare_systems
253 unique wind postcodes
424 total wind units
1468.67 total wind kw
Out[ ]:
Type Locations System Quantity KW
0 Hydro 20 20 50.70
0 Solar 2806 3073708 16519611.88
0 Wind 253 424 1468.67
In [ ]:
swh_hp = df_SSI_SWH_Air_source_heat_pump["Small Unit Installation Postcode"].drop_duplicates()

postcodes = swh_hp.shape[0]
quantity = sum(df_SSI_SWH_Air_source_heat_pump["Previous Years (2001-2021) - Installation Quantity"])
kw = 0 # no kw column

print(postcodes, "unique heat-pump postcodes")
print(quantity, "total heat-pump units")
print(kw , "total heat-pump kw")

hp_df = pd.DataFrame([["Heat-pump",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, hp_df])
df_compare_systems
2610 unique heat-pump postcodes
365635 total heat-pump units
0 total heat-pump kw
Out[ ]:
Type Locations System Quantity KW
0 Hydro 20 20 50.70
0 Solar 2806 3073708 16519611.88
0 Wind 253 424 1468.67
0 Heat-pump 2610 365635 0.00
In [ ]:
swh_solar = df_SSI_SWH_Solar["Small Unit Installation Postcode"].drop_duplicates()

postcodes = swh_solar.shape[0]
quantity = sum(df_SSI_SWH_Solar["Previous Years (2001-2021) - Installation Quantity"])
kw = 0 # no kw column

print(postcodes, "unique swh-solar postcodes")
print(quantity, "total swh-solar units")
print(kw , "total swh-solar kw")

swh_solar_df = pd.DataFrame([["SWH-Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, swh_solar_df])
df_compare_systems
2964 unique swh-solar postcodes
1036390 total swh-solar units
0 total swh-solar kw
Out[ ]:
Type Locations System Quantity KW
0 Hydro 20 20 50.70
0 Solar 2806 3073708 16519611.88
0 Wind 253 424 1468.67
0 Heat-pump 2610 365635 0.00
0 SWH-Solar 2964 1036390 0.00
In [ ]:
df_compare_systems[["Locations","System Quantity","KW"]] = df_compare_systems[["Locations","System Quantity","KW"]].apply(pd.to_numeric)
Locations_graph = df_compare_systems.plot.bar(x='Type', y='Locations', rot=0)
System_Quantity_graph = df_compare_systems.plot.bar(x='Type', y='System Quantity', rot=0)
kW_graph = df_compare_systems.plot.bar(x='Type', y='KW', rot=0)

From this, there are two questions:¶

1- Do swh units not produce KW or is it simply not included in the datset?

2- Is the observed KW value already calculated based on the system quantity, or should the two be multiplied to find the overall kw production per postcode?

The main observations¶

1- Most locations that have solar panels will also have a solar water heating which makes intuitive sense.

2- In a given household you may have 1 heat pump and 1 SWH-solar but many panels which explains the quantity difference.

3- It's very rare for residential properties to have wind or hydro which explains the low numbers compared to solar.

4- Between 2001 and 2021 (The last 20 years), Australian residents installed around 16 Million kW of solar energy, 1500kW of Wind energy, and 50kW of hydro energy.

In [ ]:
# df_SSI_SGU_Hydro,
# df_SSI_SGU_Solar,
# df_SSI_SGU_Wind,
# df_SSI_SWH_Air_source_heat_pump,
# df_SSI_SWH_Solar,
# df_battery_installations,
# df_AU_postcode_dataset,

#1) individual dataset trends
#2) Geomap by psotcode
#3) Geomap & barchart by state with battery data included

Creating a combined postcode dataset

In [ ]:
swh_solar_df = pd.DataFrame([["SWH-Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])

df_compare_systems = pd.concat([df_compare_systems, swh_solar_df])

df_SSI_SGU_Hydro_small = df_SSI_SGU_Hydro[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SGU_Solar_small = df_SSI_SGU_Solar[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SGU_Wind_small = df_SSI_SGU_Wind[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SWH_Air_source_heat_pump_small = df_SSI_SWH_Air_source_heat_pump[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity']]
df_SSI_SWH_Solar_small = df_SSI_SWH_Solar[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity']]

df_SSI_SGU_Hydro_small2= df_SSI_SGU_Hydro_small.copy(deep=True)
df_SSI_SGU_Solar_small2= df_SSI_SGU_Solar_small.copy(deep=True)
df_SSI_SGU_Wind_small2= df_SSI_SGU_Wind_small.copy(deep=True)
df_SSI_SWH_Air_source_heat_pump_small2= df_SSI_SWH_Air_source_heat_pump_small.copy(deep=True)
df_SSI_SWH_Solar_small2= df_SSI_SWH_Solar_small.copy(deep=True)

df_SSI_SWH_Air_source_heat_pump_small2["Previous Years (2001-2021) - SGU Output Rated Output In kW"] = None
df_SSI_SWH_Solar_small2["Previous Years (2001-2021) - SGU Output Rated Output In kW"] = None


df_SSI_SGU_Hydro_small2['Type'] = "Hydro"
df_SSI_SGU_Solar_small2['Type'] ="Solar"
df_SSI_SGU_Wind_small2['Type'] = "Wind"
df_SSI_SWH_Air_source_heat_pump_small2['Type'] = "SWG- Heat Pump"
df_SSI_SWH_Solar_small2['Type'] = "SWH Solar"

df_systems_combined = pd.concat([df_SSI_SGU_Hydro_small2,df_SSI_SGU_Solar_small2,df_SSI_SGU_Wind_small2,df_SSI_SWH_Air_source_heat_pump_small2,df_SSI_SWH_Solar_small2])

Cleaning up the Australian postcode reference dataset

In [ ]:
df_AU_postcode_dataset_small = df_AU_postcode_dataset[['Postcode','State','Long','Lat','LGA Region','Electorate Rating']]
df_AU_postcode_dataset_small = df_AU_postcode_dataset_small.drop_duplicates(subset='Postcode', keep="first")
df_AU_postcode_dataset_small
Out[ ]:
Postcode State Long Lat LGA Region Electorate Rating
0 200 ACT 149.119000 -35.277700 Unincorporated ACT NaN
2 800 NT 130.836680 -12.458684 Darwin Waterfront Precinct Inner Metropolitan
4 801 NT 130.836680 -12.458684 Darwin Waterfront Precinct Rural
5 803 NT 0.000000 0.000000 NaN NaN
6 804 NT 130.873315 -12.428017 Darwin NaN
... ... ... ... ... ... ...
18508 9013 QLD 152.823141 -27.603479 Ipswich NaN
18509 9015 QLD 152.823141 -27.603479 Ipswich NaN
18510 9464 QLD 153.074982 -27.397055 Brisbane NaN
18511 9726 QLD 153.412197 -28.008783 Gold Coast NaN
18512 9999 VIC 144.956776 -37.817403 Melbourne NaN

3173 rows × 6 columns

Merge in the postcode reference data with combined postcode dataset

In [ ]:
df_systems_combined_extra = df_systems_combined.merge(df_AU_postcode_dataset_small, how='left', left_on='Small Unit Installation Postcode', right_on='Postcode')

df_systems_combined_extra.isnull().sum()
Out[ ]:
Small Unit Installation Postcode                                 0
Previous Years (2001-2021) - Installation Quantity               0
Previous Years (2001-2021) - SGU Output Rated Output In kW    5574
Type                                                             0
Postcode                                                       217
State                                                          217
Long                                                           217
Lat                                                            217
LGA Region                                                     218
Electorate Rating                                             1021
dtype: int64

We expect kW output to have null values but not other merged columns which all suspiciously have 217 null values

In [ ]:
df_systems_combined_extra[df_systems_combined_extra.Postcode.isnull()]
Out[ ]:
Small Unit Installation Postcode Previous Years (2001-2021) - Installation Quantity Previous Years (2001-2021) - SGU Output Rated Output In kW Type Postcode State Long Lat LGA Region Electorate Rating
20 0 4 4.46 Solar NaN NaN NaN NaN NaN NaN
101 1848 1 1.04 Solar NaN NaN NaN NaN NaN NaN
510 2552 1 0.56 Solar NaN NaN NaN NaN NaN NaN
1502 4062 1 1.50 Solar NaN NaN NaN NaN NaN NaN
1503 4063 1 1.25 Solar NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ...
8645 7919 1 NaN SWH Solar NaN NaN NaN NaN NaN NaN
8646 7920 1 NaN SWH Solar NaN NaN NaN NaN NaN NaN
8648 8084 1 NaN SWH Solar NaN NaN NaN NaN NaN NaN
8650 9072 1 NaN SWH Solar NaN NaN NaN NaN NaN NaN
8651 9156 1 NaN SWH Solar NaN NaN NaN NaN NaN NaN

217 rows × 10 columns

After inspecting these 217 rows, we find that they don't correspond to legitimate postcodes and seem to be a data mistake caused by the author of the dataset, likely by prefilling consecutive postcode values.

Therefore we can remove these 217 rows

In [ ]:
df_systems_combined_cleaned = df_systems_combined_extra[~df_systems_combined_extra.Postcode.isnull()]
df_systems_combined_cleaned
Out[ ]:
Small Unit Installation Postcode Previous Years (2001-2021) - Installation Quantity Previous Years (2001-2021) - SGU Output Rated Output In kW Type Postcode State Long Lat LGA Region Electorate Rating
0 837 1 4.00 Hydro 837.0 NT 131.016647 -12.704767 Litchfield Rural
1 2484 1 1.90 Hydro 2484.0 NSW 153.316480 -28.380590 Tweed Rural
2 2527 1 4.00 Hydro 2527.0 NSW 150.754592 -34.583539 Shellharbour Provincial
3 3409 1 1.05 Hydro 3409.0 VIC 141.863237 -36.724843 Horsham Rural
4 3691 1 1.00 Hydro 3691.0 VIC 147.880991 -36.027154 Towong Rural
... ... ... ... ... ... ... ... ... ... ...
8642 7469 2 NaN SWH Solar 7469.0 TAS 145.201632 -41.784391 Waratah-Wynyard Rural
8643 7470 3 NaN SWH Solar 7470.0 TAS 145.500566 -41.902129 West Coast Rural
8647 8002 1 NaN SWH Solar 8002.0 VIC 144.982207 -37.818517 Yarra NaN
8649 9000 1 NaN SWH Solar 9000.0 QLD 152.823141 -27.603479 Ipswich NaN
8652 9726 2 NaN SWH Solar 9726.0 QLD 153.412197 -28.008783 Gold Coast NaN

8436 rows × 10 columns

In [ ]:
#df_systems_combined_cleaned)

#world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# aus = world[world.name == 'Australia']

# gdf = gpd.GeoDataFrame(df_systems_combined_cleaned, geometry=gpd.points_from_xy(df_systems_combined_cleaned.Lat, df_systems_combined_cleaned.Long))

# fig, ax = plt.subplots(figsize=(10, 8))
# aus.plot(ax=ax, color='lightgrey')
# gdf.plot(ax=ax, marker='o', color='blue', markersize=50)
# plt.title('Geospatial Visualization of Australian Postcodes')
# plt.show()

m = folium.Map(location=[-38.2744, 140.7751], zoom_start=4)

# from https://www.linkedin.com/pulse/mapping-australian-geograph-data-python-dilan-jayasekara/
for index, row in df_systems_combined_cleaned.iterrows():
    # if row['Previous Years (2001-2021) - Installation Quantity'] >= 1:
    #     marker_color = 'darkred'
    #     fill_color = 'darkred'
    # elif row['Previous Years (2001-2021) - Installation Quantity'] >= 100:
    #     marker_color = 'darkred'
    #     fill_color = 'red'
    # elif row['Previous Years (2001-2021) - Installation Quantity'] >= 50:
    #     marker_color = 'darkred'
    #     fill_color = 'lightred'
    # elif row['Previous Years (2001-2021) - Installation Quantity'] >= 10:
    #     marker_color = 'darkred'
    #     fill_color = 'orange'
    # else:
    #     marker_color='darkred'
    #     fill_color = 'darkpurple'
        
    folium.Circle(
          location=[row['Lat'], row['Long']],
          popup= 'Postcode:' +str(row['Postcode']),
          tooltip=row['Postcode'],
          #radius=row['Previous Years (2001-2021) - Installation Quantity']*10,
          radius= 4000,
          #color=marker_color,
          color='darkred',
          fill=True,
          #fill_color=fill_color,
          fill_color='green'
       ).add_to(m)
m
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook